In [914]:
!pip install arch
Requirement already satisfied: arch in /usr/local/lib/python3.7/dist-packages (5.1.0)
Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.7/dist-packages (from arch) (1.19.5)
Requirement already satisfied: property-cached>=1.6.4 in /usr/local/lib/python3.7/dist-packages (from arch) (1.6.4)
Requirement already satisfied: statsmodels>=0.11 in /usr/local/lib/python3.7/dist-packages (from arch) (0.13.1)
Requirement already satisfied: scipy>=1.3 in /usr/local/lib/python3.7/dist-packages (from arch) (1.4.1)
Requirement already satisfied: pandas>=1.0 in /usr/local/lib/python3.7/dist-packages (from arch) (1.1.5)
Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.7/dist-packages (from pandas>=1.0->arch) (2.8.2)
Requirement already satisfied: pytz>=2017.2 in /usr/local/lib/python3.7/dist-packages (from pandas>=1.0->arch) (2018.9)
Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.7/dist-packages (from python-dateutil>=2.7.3->pandas>=1.0->arch) (1.15.0)
Requirement already satisfied: patsy>=0.5.2 in /usr/local/lib/python3.7/dist-packages (from statsmodels>=0.11->arch) (0.5.2)
In [915]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import date
from scipy import stats

%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

from datetime import datetime, timedelta
from sklearn.metrics import mean_squared_error as mse
from sklearn.preprocessing import MinMaxScaler
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

import ipywidgets as widgets
from IPython.display import display
In [2023]:
w = widgets.Dropdown(
    options=['SELECT','AAPL', 'ABUS', 'ARDS', 'BABA','BFRI', 
             'FB', 'GME', 'MCD','PFE', 'PLUG', 
             'QCOM', 'SENS','TSLA', 'TWTR', 'UUUU'],
    value='SELECT',
    description ='Stock name:',

)

def on_change(change):
    """Print the newly selected ticker when the dropdown's ``value`` trait changes.

    ``change`` is the notification dict passed by ``w.observe``; any notification
    that is not a 'change' of the 'value' trait is ignored.
    """
    is_value_change = change['type'] == 'change' and change['name'] == 'value'
    if not is_value_change:
        return
    print("You have selected %s" % change['new'])

w.observe(on_change)

display(w)
You have selected SENS
In [2024]:
# Load the feature file for the ticker chosen in the dropdown. Every file follows
# the same '/content/Final_<TICKER>.csv' naming scheme, so the path is derived
# from the selection instead of keeping one hard-coded branch per ticker.
ticker = w.value
if ticker == 'SELECT':
    # 'SELECT' is the dropdown placeholder. Without this guard `df` would be left
    # undefined (or stale from an earlier run) and the failure would only surface
    # in a later cell.
    raise ValueError(
        "No stock selected: choose a ticker in the dropdown before running this cell."
    )
df = pd.read_csv(f'/content/Final_{ticker}.csv')
In [2025]:
pd.set_option('display.max_colwidth', None)
In [2026]:
df['Date'] = df['Date'].astype("datetime64[ns]")
In [2027]:
# Remove the leftover 'Unnamed: 0' column read from the CSV. errors='ignore'
# keeps the cell safe to re-run: `del df[...]` raises KeyError once the column
# is already gone.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')
In [2028]:
df.head(5)
Out[2028]:
Date Open High Low Close Adj Close Volume Return Beta Variance AvgTrueRange Upperband Lowerband Middleband APO NATR TRANGE DMI MACD MACDSIGNAL MACDHIST MOM PPO ROCP RSI TRIX ULTOSC SLOWK SLOWD AD ADOSC OBV Upward_momentum_created Downward_momentum_created B5_O_Um B5_C_Um B5_E_Um B5_A_Um B5_N_Um B5_O_Dm B5_C_Dm B5_E_Dm B5_A_Dm B5_N_Dm Verified_status_True Verified_status_False O C E A N Real_or_Fake_tweet
0 2016-03-28 2.88 2.929 2.630 2.75 2.75 111000 1.476013 1.107503 0.053127 0.227108 3.362412 2.440445 2.901429 NaN 8.258460 0.299 45.765037 NaN NaN NaN -0.70 NaN -0.202899 30.602070 NaN NaN 31.087169 32.911863 -182475.012134 -82100.824059 -1141200.0 0.0 1600.000000 0.0 0.0 0.0 0.0 0.0 1600.000000 1600.000000 1600.000000 1600.000000 1600.000000 0 3 3 3 3 3 3 3
1 2016-03-29 2.80 2.820 2.610 2.75 2.75 23100 0.000000 1.259997 0.003412 0.224664 2.918257 2.684600 2.801429 NaN 8.169590 0.210 47.163147 NaN NaN NaN -0.70 NaN -0.202899 30.602070 NaN NaN 27.859244 30.604078 -174774.994650 -76501.882744 -1141200.0 0.0 255261.024903 0.0 0.0 0.0 0.0 0.0 255261.024903 255261.024903 255261.024903 255261.024903 255261.024903 1 8 9 9 9 9 9 9
2 2016-03-30 2.90 2.960 2.804 2.85 2.85 297200 3.636360 0.482263 0.003412 0.222569 2.918257 2.684600 2.801429 NaN 7.809435 0.210 21.569756 NaN NaN NaN -0.40 NaN -0.123077 37.607071 NaN NaN 33.333333 30.759916 -296703.339538 -106052.072602 -844000.0 0.0 58496.727393 0.0 0.0 0.0 0.0 0.0 58496.727393 58496.727393 58496.727393 58496.727393 58496.727393 1 2 3 3 3 3 3 3
3 2016-03-31 2.93 2.950 2.830 2.86 2.86 412300 0.350877 0.855288 0.003563 0.207916 2.922243 2.683471 2.802857 NaN 7.269798 0.120 21.569756 NaN NaN NaN -0.39 NaN -0.120000 38.278015 NaN NaN 48.316486 36.503021 -502853.749120 -174093.001266 -431700.0 0.0 176634.669728 0.0 0.0 0.0 0.0 0.0 176634.669728 176634.669728 176634.669728 176634.669728 176634.669728 3 9 12 12 12 12 12 12
4 2016-04-01 2.95 2.960 2.850 2.92 2.92 29900 2.097909 0.855980 0.005253 0.193928 2.960670 2.670758 2.815714 NaN 6.641377 0.110 19.548425 NaN NaN NaN -0.53 NaN -0.153623 42.288102 NaN NaN 68.749389 50.133069 -494699.156534 -183506.647054 -401800.0 0.0 60647.000000 0.0 0.0 0.0 0.0 0.0 60647.000000 60647.000000 60647.000000 60647.000000 60647.000000 1 3 4 4 4 4 4 4
In [2029]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1319 entries, 0 to 1318
Data columns (total 52 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   Date                       1319 non-null   datetime64[ns]
 1   Open                       1319 non-null   float64       
 2   High                       1319 non-null   float64       
 3   Low                        1319 non-null   float64       
 4   Close                      1319 non-null   float64       
 5   Adj Close                  1319 non-null   float64       
 6   Volume                     1319 non-null   int64         
 7   Return                     1319 non-null   float64       
 8   Beta                       1319 non-null   float64       
 9   Variance                   1319 non-null   float64       
 10  AvgTrueRange               1319 non-null   float64       
 11  Upperband                  1319 non-null   float64       
 12  Lowerband                  1319 non-null   float64       
 13  Middleband                 1319 non-null   float64       
 14  APO                        1313 non-null   float64       
 15  NATR                       1319 non-null   float64       
 16  TRANGE                     1319 non-null   float64       
 17  DMI                        1319 non-null   float64       
 18  MACD                       1305 non-null   float64       
 19  MACDSIGNAL                 1305 non-null   float64       
 20  MACDHIST                   1305 non-null   float64       
 21  MOM                        1319 non-null   float64       
 22  PPO                        1313 non-null   float64       
 23  ROCP                       1319 non-null   float64       
 24  RSI                        1319 non-null   float64       
 25  TRIX                       1255 non-null   float64       
 26  ULTOSC                     1310 non-null   float64       
 27  SLOWK                      1319 non-null   float64       
 28  SLOWD                      1319 non-null   float64       
 29  AD                         1319 non-null   float64       
 30  ADOSC                      1319 non-null   float64       
 31  OBV                        1319 non-null   float64       
 32  Upward_momentum_created    1319 non-null   float64       
 33  Downward_momentum_created  1319 non-null   float64       
 34  B5_O_Um                    1319 non-null   float64       
 35  B5_C_Um                    1319 non-null   float64       
 36  B5_E_Um                    1319 non-null   float64       
 37  B5_A_Um                    1319 non-null   float64       
 38  B5_N_Um                    1319 non-null   float64       
 39  B5_O_Dm                    1319 non-null   float64       
 40  B5_C_Dm                    1319 non-null   float64       
 41  B5_E_Dm                    1319 non-null   float64       
 42  B5_A_Dm                    1319 non-null   float64       
 43  B5_N_Dm                    1319 non-null   float64       
 44  Verified_status_True       1319 non-null   int64         
 45  Verified_status_False      1319 non-null   int64         
 46  O                          1319 non-null   int64         
 47  C                          1319 non-null   int64         
 48  E                          1319 non-null   int64         
 49  A                          1319 non-null   int64         
 50  N                          1319 non-null   int64         
 51  Real_or_Fake_tweet         1319 non-null   int64         
dtypes: datetime64[ns](1), float64(42), int64(9)
memory usage: 536.0 KB
In [2030]:
df.shape
Out[2030]:
(1319, 52)
In [2031]:
sns.set(font_scale=0.8)
In [2032]:
# USE SEABORN'S "talk" CONTEXT WITH ENLARGED FONTS FOR THE FIGURES BELOW
sns.set_context("talk", font_scale=1.3)

# PLOT THE SELECTED STOCK'S DAILY CLOSING PRICE OVER THE WHOLE SAMPLE
with sns.axes_style("darkgrid"):
    fig, ax = plt.subplots(figsize=(18,8))
    # Draw on the axes created above explicitly instead of relying on the
    # implicit "current axes".
    sns.lineplot(x=df.Date, y=df.Close, color='blue', ax=ax)
    ax.set_title('Closing Price')
In [2033]:
# CALCULATE PRICE RETURNS AS DAILY PERCENTAGE CHANGE USING pct_change()
# The first row is NaN (no previous close). Calling .dropna() before the
# assignment has no effect: assigning to a column re-aligns on df's index and
# restores that NaN, so the call is omitted.
df['returns'] = 100 * df.Close.pct_change()
In [2034]:
# CALCULATE DAILY LOG RETURNS: ln(Close_t / Close_{t-1})
# The first row is NaN because shift(1) has no previous close to divide by.
df['log_returns'] = np.log(df.Close/df.Close.shift(1))
In [2035]:
df.head()
Out[2035]:
Date Open High Low Close Adj Close Volume Return Beta Variance AvgTrueRange Upperband Lowerband Middleband APO NATR TRANGE DMI MACD MACDSIGNAL MACDHIST MOM PPO ROCP RSI TRIX ULTOSC SLOWK SLOWD AD ADOSC OBV Upward_momentum_created Downward_momentum_created B5_O_Um B5_C_Um B5_E_Um B5_A_Um B5_N_Um B5_O_Dm B5_C_Dm B5_E_Dm B5_A_Dm B5_N_Dm Verified_status_True Verified_status_False O C E A N Real_or_Fake_tweet returns log_returns
0 2016-03-28 2.88 2.929 2.630 2.75 2.75 111000 1.476013 1.107503 0.053127 0.227108 3.362412 2.440445 2.901429 NaN 8.258460 0.299 45.765037 NaN NaN NaN -0.70 NaN -0.202899 30.602070 NaN NaN 31.087169 32.911863 -182475.012134 -82100.824059 -1141200.0 0.0 1600.000000 0.0 0.0 0.0 0.0 0.0 1600.000000 1600.000000 1600.000000 1600.000000 1600.000000 0 3 3 3 3 3 3 3 NaN NaN
1 2016-03-29 2.80 2.820 2.610 2.75 2.75 23100 0.000000 1.259997 0.003412 0.224664 2.918257 2.684600 2.801429 NaN 8.169590 0.210 47.163147 NaN NaN NaN -0.70 NaN -0.202899 30.602070 NaN NaN 27.859244 30.604078 -174774.994650 -76501.882744 -1141200.0 0.0 255261.024903 0.0 0.0 0.0 0.0 0.0 255261.024903 255261.024903 255261.024903 255261.024903 255261.024903 1 8 9 9 9 9 9 9 0.000000 0.000000
2 2016-03-30 2.90 2.960 2.804 2.85 2.85 297200 3.636360 0.482263 0.003412 0.222569 2.918257 2.684600 2.801429 NaN 7.809435 0.210 21.569756 NaN NaN NaN -0.40 NaN -0.123077 37.607071 NaN NaN 33.333333 30.759916 -296703.339538 -106052.072602 -844000.0 0.0 58496.727393 0.0 0.0 0.0 0.0 0.0 58496.727393 58496.727393 58496.727393 58496.727393 58496.727393 1 2 3 3 3 3 3 3 3.636360 0.035718
3 2016-03-31 2.93 2.950 2.830 2.86 2.86 412300 0.350877 0.855288 0.003563 0.207916 2.922243 2.683471 2.802857 NaN 7.269798 0.120 21.569756 NaN NaN NaN -0.39 NaN -0.120000 38.278015 NaN NaN 48.316486 36.503021 -502853.749120 -174093.001266 -431700.0 0.0 176634.669728 0.0 0.0 0.0 0.0 0.0 176634.669728 176634.669728 176634.669728 176634.669728 176634.669728 3 9 12 12 12 12 12 12 0.350877 0.003503
4 2016-04-01 2.95 2.960 2.850 2.92 2.92 29900 2.097909 0.855980 0.005253 0.193928 2.960670 2.670758 2.815714 NaN 6.641377 0.110 19.548425 NaN NaN NaN -0.53 NaN -0.153623 42.288102 NaN NaN 68.749389 50.133069 -494699.156534 -183506.647054 -401800.0 0.0 60647.000000 0.0 0.0 0.0 0.0 0.0 60647.000000 60647.000000 60647.000000 60647.000000 60647.000000 1 3 4 4 4 4 4 4 2.097909 0.020762
In [2036]:
# DROP EVERY ROW THAT STILL CONTAINS A NaN IN ANY COLUMN
# This removes the first row (no previous close for returns / log returns) and
# also the leading rows where indicator columns such as TRIX are still NaN, so
# more than one row is dropped.
df = df.dropna()
In [2037]:
# PLOT EACH RETURN SERIES OVER TIME NEXT TO ITS HISTOGRAM
# The histogram panels overlay a normal distribution fitted to the data
# (fit=stats.norm) for a visual check against normality.
with sns.axes_style("darkgrid"):
    fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(18,12))

    # One row of the grid per series: (data, panel title, colour).
    panels = [
        (df.returns, 'Returns', 'blue'),
        (df.log_returns, 'Log Returns', 'green'),
    ]
    for (ax_line, ax_dist), (series, title, color) in zip(axes, panels):
        # Left panel: the series itself.
        ax_line.plot(series, color=color)
        ax_line.set_title(title)

        # Right panel: normalised histogram with the fitted normal curve.
        # NOTE(review): sns.distplot is deprecated in recent seaborn releases;
        # it is kept here because its replacements have no `fit=` argument.
        sns.distplot(series, norm_hist=True, fit=stats.norm, color=color,
                     bins=50, ax=ax_dist)
        ax_dist.set_title(title)

    plt.tight_layout()
    # plt.show() rather than fig.show(): the latter needs a GUI backend and only
    # emits a (here suppressed) warning under %matplotlib inline.
    plt.show()
In [2038]:
# CREATE A FUNCTION THAT CALCULATES REALIZED VOLATILITY
# FROM DAILY LOG RETURNS
def realized_volatility_daily(series_log_return):
    """
    Realized volatility of one window of daily log returns.

    Computed as ``sqrt(sum(r_i ** 2) / (n - 1))`` where ``n`` is the number of
    observations in the window, i.e. the square root of the sum of squared log
    returns scaled by ``n - 1``.

    Parameters
    ----------
    series_log_return : pandas.Series or numpy.ndarray
        Log returns inside the window (this is what ``rolling(...).apply``
        passes in).

    Returns
    -------
    float
        The realized volatility, or NaN when the window holds fewer than two
        observations (the ``n - 1`` divisor would be zero or negative).
    """
    n = len(series_log_return)
    if n < 2:
        # Not enough observations for the (n - 1) scaling to be defined.
        return np.nan
    return np.sqrt(np.sum(series_log_return**2)/(n - 1))
In [2039]:
intervals = [7, 30, 60, 180, 365]

# BUILD ONE COLUMN OF ROLLING REALIZED VOLATILITY PER WINDOW LENGTH
# Each column is computed from df.log_returns with that rolling window; the
# resulting frame shares df's index and has the window lengths as column labels.
vols_df = pd.DataFrame(
    {
        window: df.log_returns.rolling(window=window)
                              .apply(realized_volatility_daily)
                              .values
        for window in intervals
    },
    columns=intervals,
    index=df.index,
)
In [2040]:
# SWITCH TO THE 'fivethirtyeight' MATPLOTLIB STYLE
plt.style.use(['fivethirtyeight'])

fig, ax = plt.subplots(figsize=(18,7))

# One line per window length; the 7-day series is drawn thinner and
# semi-transparent, all other windows use full opacity and a thicker line.
for i in intervals:
    is_shortest = (i == 7)
    ax.plot(vols_df[i],
            label=f'{i}-Day Interval Realized Volatility',
            alpha=0.5 if is_shortest else 1.0,
            lw=1 if is_shortest else 2)

ax.set_title('Realized Volatility Using Different Interval Windows', fontsize=21)

plt.legend(loc='best', prop={'size': 14})
plt.show();
In [2041]:
INTERVAL_WINDOW = 30
n_future = 7

# BACKWARD-LOOKING REALIZED VOLATILITY
# Row t uses the INTERVAL_WINDOW log returns ending at row t.
df['vol_current'] = (
    df.log_returns
      .rolling(window=INTERVAL_WINDOW)
      .apply(realized_volatility_daily)
)

# FORWARD-LOOKING REALIZED VOLATILITY
# The log returns are first shifted back by n_future rows, so row t uses the
# INTERVAL_WINDOW-row window that ends n_future rows after t.
shifted_log_returns = df.log_returns.shift(-n_future)
df['vol_future'] = (
    shifted_log_returns
      .rolling(window=INTERVAL_WINDOW)
      .apply(realized_volatility_daily)
)
In [2042]:
df.describe()
Out[2042]:
Open High Low Close Adj Close Volume Return Beta Variance AvgTrueRange Upperband Lowerband Middleband APO NATR TRANGE DMI MACD MACDSIGNAL MACDHIST MOM PPO ROCP RSI TRIX ULTOSC SLOWK SLOWD AD ADOSC OBV Upward_momentum_created Downward_momentum_created B5_O_Um B5_C_Um B5_E_Um B5_A_Um B5_N_Um B5_O_Dm B5_C_Dm B5_E_Dm B5_A_Dm B5_N_Dm Verified_status_True Verified_status_False O C E A N Real_or_Fake_tweet returns log_returns vol_current vol_future
count 1255.000000 1255.000000 1255.000000 1255.000000 1255.000000 1.255000e+03 1255.000000 1255.000000 1255.000000 1255.000000 1255.000000 1255.000000 1255.000000 1255.000000 1255.000000 1255.000000 1255.000000 1255.000000 1255.000000 1255.000000 1255.000000 1255.000000 1255.000000 1255.000000 1255.000000 1255.000000 1255.000000 1255.000000 1.255000e+03 1.255000e+03 1.255000e+03 1255.0 1.255000e+03 1255.0 1255.0 1255.0 1255.0 1255.0 1.255000e+03 1.255000e+03 1.255000e+03 1.255000e+03 1.255000e+03 1255.000000 1255.000000 1255.000000 1255.000000 1255.000000 1255.000000 1255.000000 1255.000000 1255.000000 1255.000000 1226.000000 1219.000000
mean 2.330097 2.406209 2.239442 2.317884 2.317884 6.635074e+06 0.176641 0.585179 0.021789 0.171681 2.533256 2.101356 2.317306 0.000936 7.860622 0.173375 36.845161 0.002835 0.003216 -0.000380 0.003543 0.174178 0.026092 48.524431 0.032245 46.636198 44.537322 44.656067 -4.900436e+07 -3.394747e+05 4.417619e+07 0.0 2.066846e+05 0.0 0.0 0.0 0.0 0.0 2.066846e+05 2.066846e+05 2.066846e+05 2.066846e+05 2.066846e+05 0.110757 26.653386 26.764143 26.764143 26.764143 26.764143 26.764143 26.764143 0.143503 -0.000324 0.052559 0.052498
std 1.202882 1.236201 1.162810 1.197414 1.197414 2.109323e+07 6.188521 0.548392 0.050890 0.108076 1.299816 1.104715 1.189288 0.219368 3.477244 0.149710 23.979419 0.149755 0.138914 0.049113 0.447930 10.391616 0.255113 13.153367 0.868821 10.495286 24.346796 22.738245 5.190953e+07 7.276898e+06 1.939866e+08 0.0 5.641543e+05 0.0 0.0 0.0 0.0 0.0 5.641543e+05 5.641543e+05 5.641543e+05 5.641543e+05 5.641543e+05 0.490422 68.991055 69.315209 69.315209 69.315209 69.315209 69.315209 69.315209 6.251351 0.058067 0.026992 0.027057
min 0.350000 0.373000 0.350000 0.358000 0.358000 7.300000e+03 -23.214286 -4.224136 0.000007 0.017976 0.376821 -0.057084 0.367571 -0.611218 2.467271 0.008000 0.009490 -0.354178 -0.315768 -0.281754 -2.150000 -36.420574 -0.602564 14.004668 -1.083681 16.265550 2.499993 4.038471 -3.134058e+08 -4.694227e+07 -1.152523e+08 0.0 2.000000e+00 0.0 0.0 0.0 0.0 0.0 2.000000e+00 2.000000e+00 2.000000e+00 2.000000e+00 2.000000e+00 0.000000 0.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 -23.214286 -0.264152 0.022862 0.022862
25% 1.100000 1.130000 1.050000 1.090000 1.090000 6.845000e+05 -2.781300 0.280277 0.001569 0.094727 1.191447 1.006744 1.097857 -0.114231 5.836999 0.079500 17.088507 -0.075172 -0.072945 -0.024025 -0.207500 -5.564701 -0.102077 39.650880 -0.382744 39.375001 23.419042 25.174841 -9.153377e+07 -1.320209e+06 -7.657110e+07 0.0 1.617070e+04 0.0 0.0 0.0 0.0 0.0 1.617070e+04 1.617070e+04 1.617070e+04 1.617070e+04 1.617070e+04 0.000000 3.000000 3.000000 3.000000 3.000000 3.000000 3.000000 3.000000 -2.832621 -0.028735 0.037376 0.037350
50% 2.600000 2.680000 2.500000 2.600000 2.600000 1.452600e+06 -0.421941 0.544773 0.006727 0.165594 2.791583 2.361585 2.617143 -0.014840 7.044222 0.150000 33.553032 -0.020216 -0.018194 0.001876 -0.020000 -1.079892 -0.013458 47.175145 -0.083783 47.038902 42.010855 41.983836 -2.992367e+07 -1.452031e+05 -1.114930e+07 0.0 5.027840e+04 0.0 0.0 0.0 0.0 0.0 5.027840e+04 5.027840e+04 5.027840e+04 5.027840e+04 5.027840e+04 0.000000 7.000000 8.000000 8.000000 8.000000 8.000000 8.000000 8.000000 -0.476190 -0.004773 0.043877 0.043821
75% 3.200000 3.295000 3.085000 3.185000 3.185000 3.627350e+06 2.261439 0.837349 0.020341 0.215851 3.461765 2.947799 3.158571 0.075910 8.659193 0.220000 53.706705 0.062421 0.059647 0.021164 0.170000 4.226780 0.084865 56.892430 0.235982 53.678881 65.104131 63.492320 -7.424962e+05 3.605157e+05 -8.176000e+05 0.0 1.855109e+05 0.0 0.0 0.0 0.0 0.0 1.855109e+05 1.855109e+05 1.855109e+05 1.855109e+05 1.855109e+05 0.000000 20.000000 20.000000 20.000000 20.000000 20.000000 20.000000 20.000000 2.258400 0.022333 0.056159 0.055970
max 5.560000 5.560000 4.810000 5.270000 5.270000 4.099237e+08 80.379736 6.368360 0.635678 0.748061 5.405548 4.359251 4.704286 0.895167 30.633581 1.500000 97.729638 0.819603 0.727997 0.208247 2.580000 52.322199 2.402538 95.055174 4.818767 87.304053 98.068250 96.255002 5.428837e+07 6.727516e+07 8.077482e+08 0.0 1.054658e+07 0.0 0.0 0.0 0.0 0.0 1.054658e+07 1.054658e+07 1.054658e+07 1.054658e+07 1.054658e+07 6.000000 996.000000 1001.000000 1001.000000 1001.000000 1001.000000 1001.000000 1001.000000 80.379736 0.589894 0.180438 0.180438
In [2043]:
# Rename the tweet-authenticity column to the shorter 'Fake_news' label.
df = df.rename(columns={'Real_or_Fake_tweet': 'Fake_news'})
In [2044]:
# Replace any remaining NaNs with the per-column median.
# NOTE(review): the describe() output above shows missing values only in
# vol_current / vol_future (rolling-window warm-up and the forward shift), so
# this fills those volatility series with a constant instead of observed
# values - confirm that is intended rather than dropping those rows.
df = df.fillna(df.median())
In [2045]:
df.isna().sum()
Out[2045]:
Date                         0
Open                         0
High                         0
Low                          0
Close                        0
Adj Close                    0
Volume                       0
Return                       0
Beta                         0
Variance                     0
AvgTrueRange                 0
Upperband                    0
Lowerband                    0
Middleband                   0
APO                          0
NATR                         0
TRANGE                       0
DMI                          0
MACD                         0
MACDSIGNAL                   0
MACDHIST                     0
MOM                          0
PPO                          0
ROCP                         0
RSI                          0
TRIX                         0
ULTOSC                       0
SLOWK                        0
SLOWD                        0
AD                           0
ADOSC                        0
OBV                          0
Upward_momentum_created      0
Downward_momentum_created    0
B5_O_Um                      0
B5_C_Um                      0
B5_E_Um                      0
B5_A_Um                      0
B5_N_Um                      0
B5_O_Dm                      0
B5_C_Dm                      0
B5_E_Dm                      0
B5_A_Dm                      0
B5_N_Dm                      0
Verified_status_True         0
Verified_status_False        0
O                            0
C                            0
E                            0
A                            0
N                            0
Fake_news                    0
returns                      0
log_returns                  0
vol_current                  0
vol_future                   0
dtype: int64
In [2046]:
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1255 entries, 64 to 1318
Data columns (total 56 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   Date                       1255 non-null   datetime64[ns]
 1   Open                       1255 non-null   float64       
 2   High                       1255 non-null   float64       
 3   Low                        1255 non-null   float64       
 4   Close                      1255 non-null   float64       
 5   Adj Close                  1255 non-null   float64       
 6   Volume                     1255 non-null   int64         
 7   Return                     1255 non-null   float64       
 8   Beta                       1255 non-null   float64       
 9   Variance                   1255 non-null   float64       
 10  AvgTrueRange               1255 non-null   float64       
 11  Upperband                  1255 non-null   float64       
 12  Lowerband                  1255 non-null   float64       
 13  Middleband                 1255 non-null   float64       
 14  APO                        1255 non-null   float64       
 15  NATR                       1255 non-null   float64       
 16  TRANGE                     1255 non-null   float64       
 17  DMI                        1255 non-null   float64       
 18  MACD                       1255 non-null   float64       
 19  MACDSIGNAL                 1255 non-null   float64       
 20  MACDHIST                   1255 non-null   float64       
 21  MOM                        1255 non-null   float64       
 22  PPO                        1255 non-null   float64       
 23  ROCP                       1255 non-null   float64       
 24  RSI                        1255 non-null   float64       
 25  TRIX                       1255 non-null   float64       
 26  ULTOSC                     1255 non-null   float64       
 27  SLOWK                      1255 non-null   float64       
 28  SLOWD                      1255 non-null   float64       
 29  AD                         1255 non-null   float64       
 30  ADOSC                      1255 non-null   float64       
 31  OBV                        1255 non-null   float64       
 32  Upward_momentum_created    1255 non-null   float64       
 33  Downward_momentum_created  1255 non-null   float64       
 34  B5_O_Um                    1255 non-null   float64       
 35  B5_C_Um                    1255 non-null   float64       
 36  B5_E_Um                    1255 non-null   float64       
 37  B5_A_Um                    1255 non-null   float64       
 38  B5_N_Um                    1255 non-null   float64       
 39  B5_O_Dm                    1255 non-null   float64       
 40  B5_C_Dm                    1255 non-null   float64       
 41  B5_E_Dm                    1255 non-null   float64       
 42  B5_A_Dm                    1255 non-null   float64       
 43  B5_N_Dm                    1255 non-null   float64       
 44  Verified_status_True       1255 non-null   int64         
 45  Verified_status_False      1255 non-null   int64         
 46  O                          1255 non-null   int64         
 47  C                          1255 non-null   int64         
 48  E                          1255 non-null   int64         
 49  A                          1255 non-null   int64         
 50  N                          1255 non-null   int64         
 51  Fake_news                  1255 non-null   int64         
 52  returns                    1255 non-null   float64       
 53  log_returns                1255 non-null   float64       
 54  vol_current                1255 non-null   float64       
 55  vol_future                 1255 non-null   float64       
dtypes: datetime64[ns](1), float64(46), int64(9)
memory usage: 558.9 KB
In [2047]:
df.shape
Out[2047]:
(1255, 56)
In [2048]:
df=df.dropna()
In [2049]:
df.dtypes
Out[2049]:
Date                         datetime64[ns]
Open                                float64
High                                float64
Low                                 float64
Close                               float64
Adj Close                           float64
Volume                                int64
Return                              float64
Beta                                float64
Variance                            float64
AvgTrueRange                        float64
Upperband                           float64
Lowerband                           float64
Middleband                          float64
APO                                 float64
NATR                                float64
TRANGE                              float64
DMI                                 float64
MACD                                float64
MACDSIGNAL                          float64
MACDHIST                            float64
MOM                                 float64
PPO                                 float64
ROCP                                float64
RSI                                 float64
TRIX                                float64
ULTOSC                              float64
SLOWK                               float64
SLOWD                               float64
AD                                  float64
ADOSC                               float64
OBV                                 float64
Upward_momentum_created             float64
Downward_momentum_created           float64
B5_O_Um                             float64
B5_C_Um                             float64
B5_E_Um                             float64
B5_A_Um                             float64
B5_N_Um                             float64
B5_O_Dm                             float64
B5_C_Dm                             float64
B5_E_Dm                             float64
B5_A_Dm                             float64
B5_N_Dm                             float64
Verified_status_True                  int64
Verified_status_False                 int64
O                                     int64
C                                     int64
E                                     int64
A                                     int64
N                                     int64
Fake_news                             int64
returns                             float64
log_returns                         float64
vol_current                         float64
vol_future                          float64
dtype: object
In [2050]:
import matplotlib.pyplot as plt
import seaborn as sns
# Annotated heat map of the whole df.corr() matrix, drawn on a 40x15-inch figure.
plt.figure(figsize=(40, 15))
sns.heatmap(data=df.corr(), annot=True)
Out[2050]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f076d1cb410>
In [2051]:
# Histogram grid over df's columns (70 bins each, small tick labels).
# The trailing semicolon keeps the array-of-axes repr out of the cell output.
df.hist(
    bins=70,
    figsize=(20, 32),
    xlabelsize=8,
    ylabelsize=8,
);
In [2052]:
# Take the 'AvgTrueRange' column of the correlation matrix, keep entries with
# |r| > 0.5 and list them from the largest coefficient downwards.
df_corr = df.corr()['AvgTrueRange']
golden_features_list = df_corr.loc[df_corr.abs() > 0.5].sort_values(ascending=False)
print(
    "There are {} strongly correlated values with AvgTrueRange:\n{}".format(
        len(golden_features_list), golden_features_list
    )
)
There are 22 strongly correlated values with AvgTrueRange:
AvgTrueRange             1.000000
TRIX                     0.773228
TRANGE                   0.767160
Upperband                0.746421
High                     0.709980
Open                     0.687844
Adj Close                0.686117
Close                    0.686117
Middleband               0.679181
Low                      0.662160
Variance                 0.655414
OBV                      0.600346
Lowerband                0.584109
MACDSIGNAL               0.549166
vol_current              0.546503
Verified_status_False    0.541082
O                        0.540786
C                        0.540786
Fake_news                0.540786
E                        0.540786
A                        0.540786
N                        0.540786
Name: AvgTrueRange, dtype: float64
In [2053]:
# Take the 'NATR' column of the correlation matrix, keep entries with
# |r| > 0.5 and list them from the largest coefficient downwards.
df_corr = df.corr()['NATR']
golden_features_list = df_corr.loc[df_corr.abs() > 0.5].sort_values(ascending=False)
print(
    "There are {} strongly correlated values with NATR :\n{}".format(
        len(golden_features_list), golden_features_list
    )
)
There are 3 strongly correlated values with NATR :
NATR           1.000000
vol_current    0.764298
vol_future     0.749678
Name: NATR, dtype: float64
In [2054]:
# Take the 'TRANGE' column of the correlation matrix, keep entries with
# |r| > 0.5 and list them from the largest coefficient downwards.
df_corr = df.corr()['TRANGE']
golden_features_list = df_corr.loc[df_corr.abs() > 0.5].sort_values(ascending=False)
print(
    "There are {} strongly correlated values with TRANGE:\n{}".format(
        len(golden_features_list), golden_features_list
    )
)
There are 24 strongly correlated values with TRANGE:
TRANGE                       1.000000
AvgTrueRange                 0.767160
Fake_news                    0.680228
N                            0.680228
A                            0.680228
E                            0.680228
C                            0.680228
O                            0.680228
Verified_status_False        0.679726
Variance                     0.656737
Volume                       0.652657
Downward_momentum_created    0.609455
B5_O_Dm                      0.609455
B5_C_Dm                      0.609455
B5_E_Dm                      0.609455
B5_A_Dm                      0.609455
B5_N_Dm                      0.609455
TRIX                         0.564208
High                         0.550156
Upperband                    0.546211
Verified_status_True         0.520266
Adj Close                    0.516016
Close                        0.516016
Open                         0.503653
Name: TRANGE, dtype: float64
In [2055]:
# Take the 'O' (labelled Openness) column of the correlation matrix, keep entries
# with |r| > 0.5 and list them from the largest coefficient downwards.
df_corr = df.corr()['O']
golden_features_list = df_corr.loc[df_corr.abs() > 0.5].sort_values(ascending=False)
print(
    "There are {} strongly correlated values with Openness:\n{}".format(
        len(golden_features_list), golden_features_list
    )
)
There are 21 strongly correlated values with Openness:
Fake_news                    1.000000
N                            1.000000
A                            1.000000
E                            1.000000
C                            1.000000
O                            1.000000
Verified_status_False        0.999986
B5_N_Dm                      0.908342
B5_C_Dm                      0.908342
Downward_momentum_created    0.908342
B5_O_Dm                      0.908342
B5_A_Dm                      0.908342
B5_E_Dm                      0.908342
Volume                       0.905358
TRANGE                       0.680228
Verified_status_True         0.662961
Variance                     0.657800
AvgTrueRange                 0.540786
vol_current                  0.540088
vol_future                   0.528364
OBV                          0.520375
Name: O, dtype: float64
In [2056]:
# Take the 'C' (labelled conscientiousness) column of the correlation matrix, keep
# entries with |r| > 0.5 and list them from the largest coefficient downwards.
df_corr = df.corr()['C']
golden_features_list = df_corr.loc[df_corr.abs() > 0.5].sort_values(ascending=False)
print(
    "There are {} strongly correlated values with conscientiousness:\n{}".format(
        len(golden_features_list), golden_features_list
    )
)
There are 21 strongly correlated values with conscientiousness:
Fake_news                    1.000000
N                            1.000000
A                            1.000000
E                            1.000000
C                            1.000000
O                            1.000000
Verified_status_False        0.999986
B5_N_Dm                      0.908342
B5_C_Dm                      0.908342
Downward_momentum_created    0.908342
B5_O_Dm                      0.908342
B5_A_Dm                      0.908342
B5_E_Dm                      0.908342
Volume                       0.905358
TRANGE                       0.680228
Verified_status_True         0.662961
Variance                     0.657800
AvgTrueRange                 0.540786
vol_current                  0.540088
vol_future                   0.528364
OBV                          0.520375
Name: C, dtype: float64
In [2057]:
# Correlations of every column with 'E', filtered to |r| > 0.5 and sorted from the
# largest coefficient downwards.
# The message used to say "conscientiousness" (copied from the 'C' cell) although
# this cell analyses column 'E'; the label now names the trait being reported.
# NOTE(review): 'E' is assumed to stand for Extraversion (Big Five, alongside the
# 'O' = Openness and 'C' = conscientiousness cells) — confirm against the data dictionary.
df_corr = df.corr()['E']
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with Extraversion:\n{}".format(len(golden_features_list), golden_features_list))
There are 21 strongly correlated values with conscientiousness:
Fake_news                    1.000000
N                            1.000000
A                            1.000000
E                            1.000000
C                            1.000000
O                            1.000000
Verified_status_False        0.999986
B5_N_Dm                      0.908342
B5_C_Dm                      0.908342
Downward_momentum_created    0.908342
B5_O_Dm                      0.908342
B5_A_Dm                      0.908342
B5_E_Dm                      0.908342
Volume                       0.905358
TRANGE                       0.680228
Verified_status_True         0.662961
Variance                     0.657800
AvgTrueRange                 0.540786
vol_current                  0.540088
vol_future                   0.528364
OBV                          0.520375
Name: E, dtype: float64
In [2058]:
# Correlations of every column with 'A', filtered to |r| > 0.5 and sorted from the
# largest coefficient downwards.
# The message used to say "conscientiousness" (copied from the 'C' cell) although
# this cell analyses column 'A'; the label now names the trait being reported.
# NOTE(review): 'A' is assumed to stand for Agreeableness (Big Five, alongside the
# 'O' = Openness and 'C' = conscientiousness cells) — confirm against the data dictionary.
df_corr = df.corr()['A']
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with Agreeableness:\n{}".format(len(golden_features_list), golden_features_list))
There are 21 strongly correlated values with conscientiousness:
Fake_news                    1.000000
N                            1.000000
A                            1.000000
E                            1.000000
C                            1.000000
O                            1.000000
Verified_status_False        0.999986
B5_N_Dm                      0.908342
B5_C_Dm                      0.908342
Downward_momentum_created    0.908342
B5_O_Dm                      0.908342
B5_A_Dm                      0.908342
B5_E_Dm                      0.908342
Volume                       0.905358
TRANGE                       0.680228
Verified_status_True         0.662961
Variance                     0.657800
AvgTrueRange                 0.540786
vol_current                  0.540088
vol_future                   0.528364
OBV                          0.520375
Name: A, dtype: float64
In [2059]:
# Correlations of every column with 'N', filtered to |r| > 0.5 and sorted from the
# largest coefficient downwards.
# The message used to say "conscientiousness" (copied from the 'C' cell) although
# this cell analyses column 'N'; the label now names the trait being reported.
# NOTE(review): 'N' is assumed to stand for Neuroticism (Big Five, alongside the
# 'O' = Openness and 'C' = conscientiousness cells) — confirm against the data dictionary.
df_corr = df.corr()['N']
golden_features_list = df_corr[abs(df_corr) > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with Neuroticism:\n{}".format(len(golden_features_list), golden_features_list))
There are 21 strongly correlated values with conscientiousness:
Fake_news                    1.000000
N                            1.000000
A                            1.000000
E                            1.000000
C                            1.000000
O                            1.000000
Verified_status_False        0.999986
B5_N_Dm                      0.908342
B5_C_Dm                      0.908342
Downward_momentum_created    0.908342
B5_O_Dm                      0.908342
B5_A_Dm                      0.908342
B5_E_Dm                      0.908342
Volume                       0.905358
TRANGE                       0.680228
Verified_status_True         0.662961
Variance                     0.657800
AvgTrueRange                 0.540786
vol_current                  0.540088
vol_future                   0.528364
OBV                          0.520375
Name: N, dtype: float64
In [2060]:
# Display df's column labels.
df.columns
Out[2060]:
Index(['Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume', 'Return',
       'Beta', 'Variance', 'AvgTrueRange', 'Upperband', 'Lowerband',
       'Middleband', 'APO', 'NATR', 'TRANGE', 'DMI', 'MACD', 'MACDSIGNAL',
       'MACDHIST', 'MOM', 'PPO', 'ROCP', 'RSI', 'TRIX', 'ULTOSC', 'SLOWK',
       'SLOWD', 'AD', 'ADOSC', 'OBV', 'Upward_momentum_created',
       'Downward_momentum_created', 'B5_O_Um', 'B5_C_Um', 'B5_E_Um', 'B5_A_Um',
       'B5_N_Um', 'B5_O_Dm', 'B5_C_Dm', 'B5_E_Dm', 'B5_A_Dm', 'B5_N_Dm',
       'Verified_status_True', 'Verified_status_False', 'O', 'C', 'E', 'A',
       'N', 'Fake_news', 'returns', 'log_returns', 'vol_current',
       'vol_future'],
      dtype='object')
In [2061]:
# Take the 'B5_O_Um' column of the correlation matrix, keep entries with
# |r| > 0.5 and list them from the largest coefficient downwards.
# NOTE(review): df.describe() reports this column as 0.0 throughout (std 0.0), which
# would explain why not even the column itself passes the filter — confirm upstream.
df_corr = df.corr()['B5_O_Um']
golden_features_list = df_corr.loc[df_corr.abs() > 0.5].sort_values(ascending=False)
print(
    "There are {} strongly correlated values with B5_O_Um:\n{}".format(
        len(golden_features_list), golden_features_list
    )
)
There are 0 strongly correlated values with B5_O_Um:
Series([], Name: B5_O_Um, dtype: float64)
In [2062]:
# Take the 'B5_C_Um' column of the correlation matrix, keep entries with
# |r| > 0.5 and list them from the largest coefficient downwards.
# NOTE(review): df.describe() reports this column as 0.0 throughout (std 0.0), which
# would explain why not even the column itself passes the filter — confirm upstream.
df_corr = df.corr()['B5_C_Um']
golden_features_list = df_corr.loc[df_corr.abs() > 0.5].sort_values(ascending=False)
print(
    "There are {} strongly correlated values with B5_C_Um:\n{}".format(
        len(golden_features_list), golden_features_list
    )
)
There are 0 strongly correlated values with B5_C_Um:
Series([], Name: B5_C_Um, dtype: float64)
In [2063]:
# Take the 'B5_E_Um' column of the correlation matrix, keep entries with
# |r| > 0.5 and list them from the largest coefficient downwards.
# NOTE(review): df.describe() reports this column as 0.0 throughout (std 0.0), which
# would explain why not even the column itself passes the filter — confirm upstream.
df_corr = df.corr()['B5_E_Um']
golden_features_list = df_corr.loc[df_corr.abs() > 0.5].sort_values(ascending=False)
print(
    "There are {} strongly correlated values with B5_E_Um:\n{}".format(
        len(golden_features_list), golden_features_list
    )
)
There are 0 strongly correlated values with B5_E_Um:
Series([], Name: B5_E_Um, dtype: float64)
In [2064]:
# Take the 'B5_A_Um' column of the correlation matrix, keep entries with
# |r| > 0.5 and list them from the largest coefficient downwards.
# NOTE(review): df.describe() reports this column as 0.0 throughout (std 0.0), which
# would explain why not even the column itself passes the filter — confirm upstream.
df_corr = df.corr()['B5_A_Um']
golden_features_list = df_corr.loc[df_corr.abs() > 0.5].sort_values(ascending=False)
print(
    "There are {} strongly correlated values with B5_A_Um:\n{}".format(
        len(golden_features_list), golden_features_list
    )
)
There are 0 strongly correlated values with B5_A_Um:
Series([], Name: B5_A_Um, dtype: float64)
In [2065]:
# Take the 'B5_N_Um' column of the correlation matrix, keep entries with
# |r| > 0.5 and list them from the largest coefficient downwards.
# NOTE(review): df.describe() reports this column as 0.0 throughout (std 0.0), which
# would explain why not even the column itself passes the filter — confirm upstream.
df_corr = df.corr()['B5_N_Um']
golden_features_list = df_corr.loc[df_corr.abs() > 0.5].sort_values(ascending=False)
print(
    "There are {} strongly correlated values with B5_N_Um:\n{}".format(
        len(golden_features_list), golden_features_list
    )
)
There are 0 strongly correlated values with B5_N_Um:
Series([], Name: B5_N_Um, dtype: float64)

Downward momentum correlation

In [2066]:
# Take the 'B5_O_Dm' column of the correlation matrix, keep entries with
# |r| > 0.5 and list them from the largest coefficient downwards.
df_corr = df.corr()['B5_O_Dm']
golden_features_list = df_corr.loc[df_corr.abs() > 0.5].sort_values(ascending=False)
print(
    "There are {} strongly correlated values with B5_O_Dm:\n{}".format(
        len(golden_features_list), golden_features_list
    )
)
There are 17 strongly correlated values with B5_O_Dm:
B5_N_Dm                      1.000000
Downward_momentum_created    1.000000
B5_O_Dm                      1.000000
B5_C_Dm                      1.000000
B5_E_Dm                      1.000000
B5_A_Dm                      1.000000
N                            0.908342
Fake_news                    0.908342
O                            0.908342
C                            0.908342
E                            0.908342
A                            0.908342
Verified_status_False        0.907616
Volume                       0.847084
Verified_status_True         0.702571
TRANGE                       0.609455
Variance                     0.529365
Name: B5_O_Dm, dtype: float64
In [2067]:
# Take the 'B5_C_Dm' column of the correlation matrix, keep entries with
# |r| > 0.5 and list them from the largest coefficient downwards.
df_corr = df.corr()['B5_C_Dm']
golden_features_list = df_corr.loc[df_corr.abs() > 0.5].sort_values(ascending=False)
print(
    "There are {} strongly correlated values with B5_C_Dm:\n{}".format(
        len(golden_features_list), golden_features_list
    )
)
There are 17 strongly correlated values with B5_C_Dm:
B5_N_Dm                      1.000000
Downward_momentum_created    1.000000
B5_O_Dm                      1.000000
B5_C_Dm                      1.000000
B5_E_Dm                      1.000000
B5_A_Dm                      1.000000
N                            0.908342
Fake_news                    0.908342
O                            0.908342
C                            0.908342
E                            0.908342
A                            0.908342
Verified_status_False        0.907616
Volume                       0.847084
Verified_status_True         0.702571
TRANGE                       0.609455
Variance                     0.529365
Name: B5_C_Dm, dtype: float64
In [2068]:
# Take the 'B5_E_Dm' column of the correlation matrix, keep entries with
# |r| > 0.5 and list them from the largest coefficient downwards.
df_corr = df.corr()['B5_E_Dm']
golden_features_list = df_corr.loc[df_corr.abs() > 0.5].sort_values(ascending=False)
print(
    "There are {} strongly correlated values with B5_E_Dm:\n{}".format(
        len(golden_features_list), golden_features_list
    )
)
There are 17 strongly correlated values with B5_E_Dm:
B5_N_Dm                      1.000000
Downward_momentum_created    1.000000
B5_O_Dm                      1.000000
B5_C_Dm                      1.000000
B5_E_Dm                      1.000000
B5_A_Dm                      1.000000
N                            0.908342
Fake_news                    0.908342
O                            0.908342
C                            0.908342
E                            0.908342
A                            0.908342
Verified_status_False        0.907616
Volume                       0.847084
Verified_status_True         0.702571
TRANGE                       0.609455
Variance                     0.529365
Name: B5_E_Dm, dtype: float64
In [2069]:
# Take the 'B5_A_Dm' column of the correlation matrix, keep entries with
# |r| > 0.5 and list them from the largest coefficient downwards.
df_corr = df.corr()['B5_A_Dm']
golden_features_list = df_corr.loc[df_corr.abs() > 0.5].sort_values(ascending=False)
print(
    "There are {} strongly correlated values with B5_A_Dm:\n{}".format(
        len(golden_features_list), golden_features_list
    )
)
There are 17 strongly correlated values with B5_A_Dm:
B5_N_Dm                      1.000000
Downward_momentum_created    1.000000
B5_O_Dm                      1.000000
B5_C_Dm                      1.000000
B5_E_Dm                      1.000000
B5_A_Dm                      1.000000
N                            0.908342
Fake_news                    0.908342
O                            0.908342
C                            0.908342
E                            0.908342
A                            0.908342
Verified_status_False        0.907616
Volume                       0.847084
Verified_status_True         0.702571
TRANGE                       0.609455
Variance                     0.529365
Name: B5_A_Dm, dtype: float64
In [2070]:
# Take the 'B5_N_Dm' column of the correlation matrix, keep entries with
# |r| > 0.5 and list them from the largest coefficient downwards.
df_corr = df.corr()['B5_N_Dm']
golden_features_list = df_corr.loc[df_corr.abs() > 0.5].sort_values(ascending=False)
print(
    "There are {} strongly correlated values with B5_N_Dm:\n{}".format(
        len(golden_features_list), golden_features_list
    )
)
There are 17 strongly correlated values with B5_N_Dm:
B5_N_Dm                      1.000000
Downward_momentum_created    1.000000
B5_O_Dm                      1.000000
B5_C_Dm                      1.000000
B5_E_Dm                      1.000000
B5_A_Dm                      1.000000
N                            0.908342
Fake_news                    0.908342
O                            0.908342
C                            0.908342
E                            0.908342
A                            0.908342
Verified_status_False        0.907616
Volume                       0.847084
Verified_status_True         0.702571
TRANGE                       0.609455
Variance                     0.529365
Name: B5_N_Dm, dtype: float64
In [2071]:
# Take the 'Fake_news' column of the correlation matrix (reported under the label
# "Real_or_Fake_tweet"), keep entries with |r| > 0.5 and list them from the
# largest coefficient downwards.
df_corr = df.corr()['Fake_news']
golden_features_list = df_corr.loc[df_corr.abs() > 0.5].sort_values(ascending=False)
print(
    "There are {} strongly correlated values with Real_or_Fake_tweet :\n{}".format(
        len(golden_features_list), golden_features_list
    )
)
There are 21 strongly correlated values with Real_or_Fake_tweet :
Fake_news                    1.000000
N                            1.000000
A                            1.000000
E                            1.000000
C                            1.000000
O                            1.000000
Verified_status_False        0.999986
B5_N_Dm                      0.908342
B5_C_Dm                      0.908342
Downward_momentum_created    0.908342
B5_O_Dm                      0.908342
B5_A_Dm                      0.908342
B5_E_Dm                      0.908342
Volume                       0.905358
TRANGE                       0.680228
Verified_status_True         0.662961
Variance                     0.657800
AvgTrueRange                 0.540786
vol_current                  0.540088
vol_future                   0.528364
OBV                          0.520375
Name: Fake_news, dtype: float64
In [2072]:
# Take the 'Downward_momentum_created' column of the correlation matrix, keep
# entries with |r| > 0.5 and list them from the largest coefficient downwards.
df_corr = df.corr()['Downward_momentum_created']
golden_features_list = df_corr.loc[df_corr.abs() > 0.5].sort_values(ascending=False)
print(
    "There are {} strongly correlated values with Downward_momentum_created :\n{}".format(
        len(golden_features_list), golden_features_list
    )
)
There are 17 strongly correlated values with Downward_momentum_created :
B5_N_Dm                      1.000000
Downward_momentum_created    1.000000
B5_O_Dm                      1.000000
B5_C_Dm                      1.000000
B5_E_Dm                      1.000000
B5_A_Dm                      1.000000
N                            0.908342
Fake_news                    0.908342
O                            0.908342
C                            0.908342
E                            0.908342
A                            0.908342
Verified_status_False        0.907616
Volume                       0.847084
Verified_status_True         0.702571
TRANGE                       0.609455
Variance                     0.529365
Name: Downward_momentum_created, dtype: float64
In [2073]:
# Take the 'Upward_momentum_created' column of the correlation matrix, keep
# entries with |r| > 0.5 and list them from the largest coefficient downwards.
# NOTE(review): df.describe() reports this column as 0.0 throughout (std 0.0), which
# would explain why not even the column itself passes the filter — confirm upstream.
df_corr = df.corr()['Upward_momentum_created']
golden_features_list = df_corr.loc[df_corr.abs() > 0.5].sort_values(ascending=False)
print(
    "There are {} strongly correlated values with Upward_momentum_created :\n{}".format(
        len(golden_features_list), golden_features_list
    )
)
There are 0 strongly correlated values with Upward_momentum_created :
Series([], Name: Upward_momentum_created, dtype: float64)
In [2074]:
# Take the 'Verified_status_True' column of the correlation matrix, keep entries
# with |r| > 0.5 and list them from the largest coefficient downwards.
df_corr = df.corr()['Verified_status_True']
golden_features_list = df_corr.loc[df_corr.abs() > 0.5].sort_values(ascending=False)
print(
    "There are {} strongly correlated values with Verified_status_True :\n{}".format(
        len(golden_features_list), golden_features_list
    )
)
There are 16 strongly correlated values with Verified_status_True :
Verified_status_True         1.000000
B5_N_Dm                      0.702571
B5_A_Dm                      0.702571
B5_E_Dm                      0.702571
B5_C_Dm                      0.702571
B5_O_Dm                      0.702571
Downward_momentum_created    0.702571
Fake_news                    0.662961
N                            0.662961
A                            0.662961
E                            0.662961
C                            0.662961
O                            0.662961
Verified_status_False        0.658967
Volume                       0.652949
TRANGE                       0.520266
Name: Verified_status_True, dtype: float64
In [2075]:
# Take the 'Verified_status_False' column of the correlation matrix, keep entries
# with |r| > 0.5 and list them from the largest coefficient downwards.
df_corr = df.corr()['Verified_status_False']
golden_features_list = df_corr.loc[df_corr.abs() > 0.5].sort_values(ascending=False)
print(
    "There are {} strongly correlated values with Verified_status_False :\n{}".format(
        len(golden_features_list), golden_features_list
    )
)
There are 21 strongly correlated values with Verified_status_False :
Verified_status_False        1.000000
Fake_news                    0.999986
N                            0.999986
A                            0.999986
E                            0.999986
C                            0.999986
O                            0.999986
B5_N_Dm                      0.907616
B5_C_Dm                      0.907616
Downward_momentum_created    0.907616
B5_O_Dm                      0.907616
B5_A_Dm                      0.907616
B5_E_Dm                      0.907616
Volume                       0.904971
TRANGE                       0.679726
Verified_status_True         0.658967
Variance                     0.657647
AvgTrueRange                 0.541082
vol_current                  0.540399
vol_future                   0.528489
OBV                          0.521090
Name: Verified_status_False, dtype: float64
In [2076]:
# Apply seaborn's theme with font elements scaled to 0.8 for the plots that follow.
sns.set(font_scale=0.8)
In [2077]:
# Plot NATR (y axis) against df's columns, five x-columns per pairplot figure.
for start in range(0, len(df.columns), 5):
    column_batch = df.columns[start:start + 5]
    sns.pairplot(data=df, x_vars=column_batch, y_vars=['NATR'])
In [2078]:
# Display the dtype of every column in df.
df.dtypes
Out[2078]:
Date                         datetime64[ns]
Open                                float64
High                                float64
Low                                 float64
Close                               float64
Adj Close                           float64
Volume                                int64
Return                              float64
Beta                                float64
Variance                            float64
AvgTrueRange                        float64
Upperband                           float64
Lowerband                           float64
Middleband                          float64
APO                                 float64
NATR                                float64
TRANGE                              float64
DMI                                 float64
MACD                                float64
MACDSIGNAL                          float64
MACDHIST                            float64
MOM                                 float64
PPO                                 float64
ROCP                                float64
RSI                                 float64
TRIX                                float64
ULTOSC                              float64
SLOWK                               float64
SLOWD                               float64
AD                                  float64
ADOSC                               float64
OBV                                 float64
Upward_momentum_created             float64
Downward_momentum_created           float64
B5_O_Um                             float64
B5_C_Um                             float64
B5_E_Um                             float64
B5_A_Um                             float64
B5_N_Um                             float64
B5_O_Dm                             float64
B5_C_Dm                             float64
B5_E_Dm                             float64
B5_A_Dm                             float64
B5_N_Dm                             float64
Verified_status_True                  int64
Verified_status_False                 int64
O                                     int64
C                                     int64
E                                     int64
A                                     int64
N                                     int64
Fake_news                             int64
returns                             float64
log_returns                         float64
vol_current                         float64
vol_future                          float64
dtype: object
In [2079]:
# Count missing values per column.
df.isnull().sum()
Out[2079]:
Date                         0
Open                         0
High                         0
Low                          0
Close                        0
Adj Close                    0
Volume                       0
Return                       0
Beta                         0
Variance                     0
AvgTrueRange                 0
Upperband                    0
Lowerband                    0
Middleband                   0
APO                          0
NATR                         0
TRANGE                       0
DMI                          0
MACD                         0
MACDSIGNAL                   0
MACDHIST                     0
MOM                          0
PPO                          0
ROCP                         0
RSI                          0
TRIX                         0
ULTOSC                       0
SLOWK                        0
SLOWD                        0
AD                           0
ADOSC                        0
OBV                          0
Upward_momentum_created      0
Downward_momentum_created    0
B5_O_Um                      0
B5_C_Um                      0
B5_E_Um                      0
B5_A_Um                      0
B5_N_Um                      0
B5_O_Dm                      0
B5_C_Dm                      0
B5_E_Dm                      0
B5_A_Dm                      0
B5_N_Dm                      0
Verified_status_True         0
Verified_status_False        0
O                            0
C                            0
E                            0
A                            0
N                            0
Fake_news                    0
returns                      0
log_returns                  0
vol_current                  0
vol_future                   0
dtype: int64
In [2080]:
# Replace any missing value in df with 0, modifying df in place.
df.fillna(value=0, inplace=True)
In [2081]:
# Drop rows containing any missing value, in place.
# NOTE(review): the preceding cell already fills NaN with 0 and df.isnull().sum()
# reports zero missing values, so this is expected to remove nothing — confirm it is still needed.
df.dropna(axis=0, how='any', inplace=True)
In [2082]:
# Apply seaborn's theme with font elements scaled to 0.8.
# NOTE(review): the same call appears a few cells earlier and no cell in between
# touches the seaborn theme, so this looks redundant.
sns.set(font_scale=0.8)
In [2083]:
# Correlation matrix of df without the 'Close' column.
corr = df.drop(columns=['Close']).corr()
plt.figure(figsize=(12, 10))

# Show only the stronger relationships; every other cell is masked out (NaN).
# NOTE(review): the positive cut-off is 0.5 but the negative one is -0.4 — confirm
# the asymmetry is intentional.
sns.heatmap(
    corr.where((corr >= 0.5) | (corr <= -0.4)),
    cmap='YlGnBu',
    vmax=1.0,
    vmin=-1.0,
    linewidths=0.1,
    annot=True,
    annot_kws={"size": 8},
    square=True,
);
In [2084]:
# Summary statistics (count, mean, std, min, quartiles, max) for df's columns.
df.describe()
Out[2084]:
Open High Low Close Adj Close Volume Return Beta Variance AvgTrueRange Upperband Lowerband Middleband APO NATR TRANGE DMI MACD MACDSIGNAL MACDHIST MOM PPO ROCP RSI TRIX ULTOSC SLOWK SLOWD AD ADOSC OBV Upward_momentum_created Downward_momentum_created B5_O_Um B5_C_Um B5_E_Um B5_A_Um B5_N_Um B5_O_Dm B5_C_Dm B5_E_Dm B5_A_Dm B5_N_Dm Verified_status_True Verified_status_False O C E A N Fake_news returns log_returns vol_current vol_future
count 1255.000000 1255.000000 1255.000000 1255.000000 1255.000000 1.255000e+03 1255.000000 1255.000000 1255.000000 1255.000000 1255.000000 1255.000000 1255.000000 1255.000000 1255.000000 1255.000000 1255.000000 1255.000000 1255.000000 1255.000000 1255.000000 1255.000000 1255.000000 1255.000000 1255.000000 1255.000000 1255.000000 1255.000000 1.255000e+03 1.255000e+03 1.255000e+03 1255.0 1.255000e+03 1255.0 1255.0 1255.0 1255.0 1255.0 1.255000e+03 1.255000e+03 1.255000e+03 1.255000e+03 1.255000e+03 1255.000000 1255.000000 1255.000000 1255.000000 1255.000000 1255.000000 1255.000000 1255.000000 1255.000000 1255.000000 1255.000000 1255.000000
mean 2.330097 2.406209 2.239442 2.317884 2.317884 6.635074e+06 0.176641 0.585179 0.021789 0.171681 2.533256 2.101356 2.317306 0.000936 7.860622 0.173375 36.845161 0.002835 0.003216 -0.000380 0.003543 0.174178 0.026092 48.524431 0.032245 46.636198 44.537322 44.656067 -4.900436e+07 -3.394747e+05 4.417619e+07 0.0 2.066846e+05 0.0 0.0 0.0 0.0 0.0 2.066846e+05 2.066846e+05 2.066846e+05 2.066846e+05 2.066846e+05 0.110757 26.653386 26.764143 26.764143 26.764143 26.764143 26.764143 26.764143 0.143503 -0.000324 0.052359 0.052249
std 1.202882 1.236201 1.162810 1.197414 1.197414 2.109323e+07 6.188521 0.548392 0.050890 0.108076 1.299816 1.104715 1.189288 0.219368 3.477244 0.149710 23.979419 0.149755 0.138914 0.049113 0.447930 10.391616 0.255113 13.153367 0.868821 10.495286 24.346796 22.738245 5.190953e+07 7.276898e+06 1.939866e+08 0.0 5.641543e+05 0.0 0.0 0.0 0.0 0.0 5.641543e+05 5.641543e+05 5.641543e+05 5.641543e+05 5.641543e+05 0.490422 68.991055 69.315209 69.315209 69.315209 69.315209 69.315209 69.315209 6.251351 0.058067 0.026710 0.026705
min 0.350000 0.373000 0.350000 0.358000 0.358000 7.300000e+03 -23.214286 -4.224136 0.000007 0.017976 0.376821 -0.057084 0.367571 -0.611218 2.467271 0.008000 0.009490 -0.354178 -0.315768 -0.281754 -2.150000 -36.420574 -0.602564 14.004668 -1.083681 16.265550 2.499993 4.038471 -3.134058e+08 -4.694227e+07 -1.152523e+08 0.0 2.000000e+00 0.0 0.0 0.0 0.0 0.0 2.000000e+00 2.000000e+00 2.000000e+00 2.000000e+00 2.000000e+00 0.000000 0.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 -23.214286 -0.264152 0.022862 0.022862
25% 1.100000 1.130000 1.050000 1.090000 1.090000 6.845000e+05 -2.781300 0.280277 0.001569 0.094727 1.191447 1.006744 1.097857 -0.114231 5.836999 0.079500 17.088507 -0.075172 -0.072945 -0.024025 -0.207500 -5.564701 -0.102077 39.650880 -0.382744 39.375001 23.419042 25.174841 -9.153377e+07 -1.320209e+06 -7.657110e+07 0.0 1.617070e+04 0.0 0.0 0.0 0.0 0.0 1.617070e+04 1.617070e+04 1.617070e+04 1.617070e+04 1.617070e+04 0.000000 3.000000 3.000000 3.000000 3.000000 3.000000 3.000000 3.000000 -2.832621 -0.028735 0.037483 0.037483
50% 2.600000 2.680000 2.500000 2.600000 2.600000 1.452600e+06 -0.421941 0.544773 0.006727 0.165594 2.791583 2.361585 2.617143 -0.014840 7.044222 0.150000 33.553032 -0.020216 -0.018194 0.001876 -0.020000 -1.079892 -0.013458 47.175145 -0.083783 47.038902 42.010855 41.983836 -2.992367e+07 -1.452031e+05 -1.114930e+07 0.0 5.027840e+04 0.0 0.0 0.0 0.0 0.0 5.027840e+04 5.027840e+04 5.027840e+04 5.027840e+04 5.027840e+04 0.000000 7.000000 8.000000 8.000000 8.000000 8.000000 8.000000 8.000000 -0.476190 -0.004773 0.043877 0.043821
75% 3.200000 3.295000 3.085000 3.185000 3.185000 3.627350e+06 2.261439 0.837349 0.020341 0.215851 3.461765 2.947799 3.158571 0.075910 8.659193 0.220000 53.706705 0.062421 0.059647 0.021164 0.170000 4.226780 0.084865 56.892430 0.235982 53.678881 65.104131 63.492320 -7.424962e+05 3.605157e+05 -8.176000e+05 0.0 1.855109e+05 0.0 0.0 0.0 0.0 0.0 1.855109e+05 1.855109e+05 1.855109e+05 1.855109e+05 1.855109e+05 0.000000 20.000000 20.000000 20.000000 20.000000 20.000000 20.000000 20.000000 2.258400 0.022333 0.055935 0.055219
max 5.560000 5.560000 4.810000 5.270000 5.270000 4.099237e+08 80.379736 6.368360 0.635678 0.748061 5.405548 4.359251 4.704286 0.895167 30.633581 1.500000 97.729638 0.819603 0.727997 0.208247 2.580000 52.322199 2.402538 95.055174 4.818767 87.304053 98.068250 96.255002 5.428837e+07 6.727516e+07 8.077482e+08 0.0 1.054658e+07 0.0 0.0 0.0 0.0 0.0 1.054658e+07 1.054658e+07 1.054658e+07 1.054658e+07 1.054658e+07 6.000000 996.000000 1001.000000 1001.000000 1001.000000 1001.000000 1001.000000 1001.000000 80.379736 0.589894 0.180438 0.180438
In [2085]:
# Remove every row that still holds a NaN, modifying df in place.
df.dropna(axis=0, how='any', inplace=True)
In [2086]:
n_zoom = 365
sns.set_context("talk", font_scale=1.3)

# Current volatility vs. the future-volatility target: the whole series in the
# top panel, the last n_zoom rows in the bottom panel.
# n_future and INTERVAL_WINDOW are not assigned in this cell; they must already
# exist in the kernel.
with sns.axes_style("whitegrid"):
    fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, figsize=(18,14))

    # Top panel: full history.
    ax1.plot(df['vol_current'], alpha=.8, lw=1, color='gray', ls=':',
             label='Current Volatility')
    ax1.plot(df['vol_future'], lw=1, color='blue',
             label=f'Next {n_future} Days Volatility (TARGET)')
    ax1.title.set_text(f'Future vs. Current Daily Volatility \n Using {INTERVAL_WINDOW}-Day Interval')
    ax1.legend(loc='upper left', prop={'size': 13}, frameon=True)

    # Bottom panel: same two series restricted to the most recent n_zoom rows,
    # drawn with thicker lines.
    ax2.plot(df['vol_current'][-n_zoom:], alpha=.8, lw=2, color='gray', ls=':',
             label='Current Volatility')
    ax2.plot(df['vol_future'][-n_zoom:], lw=2, color='blue',
             label=f'Next {n_future} Days Volatility (TARGET)')
    ax2.title.set_text(f'Zooming in the Last {n_zoom} Days')
    ax2.legend(loc='upper left', prop={'size': 13}, frameon=True)

    plt.tight_layout()

    plt.show();

Daily Volatility Distribution

In [2087]:
# Normalised histogram of current volatility (50 bins) with a fitted normal
# curve overlaid via scipy.stats.norm.
with sns.axes_style("darkgrid"):
    fig, ax = plt.subplots(figsize=(10,6))
    sns.distplot(
        df['vol_current'],
        bins=50,
        norm_hist=True,
        fit=stats.norm,
        ax=ax,
    )
    ax.set_title('Daily Volatility Distribution')

    plt.show();

Experiment 2: weekly granularity

In [2088]:
# Ticker picker for this experiment; 'SELECT' is the placeholder shown before a
# choice is made.
w = widgets.Dropdown(
    options=[
        'SELECT',
        'AAPL', 'ABUS', 'ARDS', 'BABA', 'BFRI',
        'FB', 'GME', 'MCD', 'PFE', 'PLUG',
        'QCOM', 'SENS', 'TSLA', 'TWTR', 'UUUU',
    ],
    value='SELECT',
    description='Stock name:',
)

def on_change(change):
    if change['type'] == 'change' and change['name'] == 'value':
        print("You have selected %s" % change['new'])

w.observe(on_change)

display(w)
You have selected SENS
In [2089]:
# Load the dataset for the ticker chosen in the dropdown above.
# Every ticker offered by the dropdown maps to /content/Final_<TICKER>.csv,
# so the path is built from the selection instead of one branch per ticker.
selected_ticker = w.value
if selected_ticker == 'SELECT':
    # Without this guard `df` would silently keep whatever frame an earlier
    # cell left behind and the rest of the experiment would run on stale data.
    raise ValueError(
        "No stock selected: pick a ticker in the dropdown above and re-run this cell."
    )
df = pd.read_csv(
    f'/content/Final_{selected_ticker}.csv',
    parse_dates=['Date'],
    index_col=['Date'],
)
In [2090]:
# List the column names of the frame loaded for the selected stock.
df.columns
Out[2090]:
Index(['Unnamed: 0', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume',
       'Return', 'Beta', 'Variance', 'AvgTrueRange', 'Upperband', 'Lowerband',
       'Middleband', 'APO', 'NATR', 'TRANGE', 'DMI', 'MACD', 'MACDSIGNAL',
       'MACDHIST', 'MOM', 'PPO', 'ROCP', 'RSI', 'TRIX', 'ULTOSC', 'SLOWK',
       'SLOWD', 'AD', 'ADOSC', 'OBV', 'Upward_momentum_created',
       'Downward_momentum_created', 'B5_O_Um', 'B5_C_Um', 'B5_E_Um', 'B5_A_Um',
       'B5_N_Um', 'B5_O_Dm', 'B5_C_Dm', 'B5_E_Dm', 'B5_A_Dm', 'B5_N_Dm',
       'Verified_status_True', 'Verified_status_False', 'O', 'C', 'E', 'A',
       'N', 'Real_or_Fake_tweet'],
      dtype='object')
In [2091]:
# (rows, columns) of the loaded frame.
df.shape
Out[2091]:
(1319, 52)
In [2092]:
# Number of missing values in each column.
df.isna().sum()
Out[2092]:
Unnamed: 0                    0
Open                          0
High                          0
Low                           0
Close                         0
Adj Close                     0
Volume                        0
Return                        0
Beta                          0
Variance                      0
AvgTrueRange                  0
Upperband                     0
Lowerband                     0
Middleband                    0
APO                           6
NATR                          0
TRANGE                        0
DMI                           0
MACD                         14
MACDSIGNAL                   14
MACDHIST                     14
MOM                           0
PPO                           6
ROCP                          0
RSI                           0
TRIX                         64
ULTOSC                        9
SLOWK                         0
SLOWD                         0
AD                            0
ADOSC                         0
OBV                           0
Upward_momentum_created       0
Downward_momentum_created     0
B5_O_Um                       0
B5_C_Um                       0
B5_E_Um                       0
B5_A_Um                       0
B5_N_Um                       0
B5_O_Dm                       0
B5_C_Dm                       0
B5_E_Dm                       0
B5_A_Dm                       0
B5_N_Dm                       0
Verified_status_True          0
Verified_status_False         0
O                             0
C                             0
E                             0
A                             0
N                             0
Real_or_Fake_tweet            0
dtype: int64
In [2093]:
# Fill gaps with each column's median, drop the leftover CSV index column
# and give the tweet-label column a shorter name.
df = (
    df.fillna(df.median())
      .drop(columns='Unnamed: 0')
      .rename(columns={'Real_or_Fake_tweet': 'Fake_news'})
)
In [2094]:
# Aggregate the date-indexed frame into weekly bins, averaging every column.
df_weekly = df.resample('W').mean()
In [2095]:
# (weeks, columns) after the weekly resampling.
df_weekly.shape
Out[2095]:
(300, 51)
In [2096]:
# Annotated heatmap of the pairwise correlations between all weekly features.
plt.figure(figsize=(40,15))
weekly_corr_matrix = df_weekly.corr()
sns.heatmap(weekly_corr_matrix, annot=True)
Out[2096]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f076e214e10>
In [2097]:
# Shrink seaborn's font scale for the plots that follow.
sns.set(font_scale=0.8)
In [2098]:
# One 50-bin histogram per weekly feature, with small tick labels.
df_weekly.hist(
    bins=50,
    figsize=(20, 32),
    xlabelsize=8,
    ylabelsize=8,
);
In [2099]:
# Weekly features whose correlation with AvgTrueRange exceeds 0.5 in
# absolute value, strongest positive first.
df_corr = df_weekly.corr()['AvgTrueRange']
golden_features_list = (
    df_corr.loc[df_corr.abs() > 0.5]
    .sort_values(ascending=False)
)
print(
    "There are {} strongly correlated values with AvgTrueRange:\n{}".format(
        len(golden_features_list), golden_features_list
    )
)
There are 22 strongly correlated values with AvgTrueRange:
AvgTrueRange             1.000000
TRANGE                   0.896646
TRIX                     0.770304
Upperband                0.726073
High                     0.689917
Open                     0.668045
Variance                 0.666319
Close                    0.665747
Adj Close                0.665747
Middleband               0.656072
Low                      0.642453
N                        0.609856
O                        0.609856
C                        0.609856
E                        0.609856
A                        0.609856
Fake_news                0.609856
Verified_status_False    0.609810
OBV                      0.602616
Lowerband                0.561685
Volume                   0.545195
MACDSIGNAL               0.534273
Name: AvgTrueRange, dtype: float64
In [2100]:
# Weekly features whose correlation with NATR exceeds 0.5 in absolute value.
df_corr = df_weekly.corr()['NATR']
golden_features_list = (
    df_corr.loc[df_corr.abs() > 0.5]
    .sort_values(ascending=False)
)
print(
    "There are {} strongly correlated values with NATR :\n{}".format(
        len(golden_features_list), golden_features_list
    )
)
There are 1 strongly correlated values with NATR :
NATR    1.0
Name: NATR, dtype: float64
In [2101]:
# Weekly features whose correlation with TRANGE exceeds 0.5 in absolute value.
df_corr = df_weekly.corr()['TRANGE']
golden_features_list = (
    df_corr.loc[df_corr.abs() > 0.5]
    .sort_values(ascending=False)
)
print(
    "There are {} strongly correlated values with TRANGE:\n{}".format(
        len(golden_features_list), golden_features_list
    )
)
There are 29 strongly correlated values with TRANGE:
TRANGE                       1.000000
AvgTrueRange                 0.896646
Variance                     0.794307
O                            0.724065
N                            0.724065
Fake_news                    0.724065
C                            0.724065
E                            0.724065
A                            0.724065
Verified_status_False        0.723646
TRIX                         0.695718
Volume                       0.681548
B5_E_Dm                      0.643234
B5_O_Dm                      0.643234
Downward_momentum_created    0.643234
B5_A_Dm                      0.643234
B5_N_Dm                      0.643234
B5_C_Dm                      0.643234
Upperband                    0.632707
Verified_status_True         0.614594
High                         0.612427
Close                        0.582765
Adj Close                    0.582765
Open                         0.582502
OBV                          0.555656
Low                          0.551307
Middleband                   0.548005
MACD                         0.514582
MACDSIGNAL                   0.513614
Name: TRANGE, dtype: float64
In [2102]:
# Weekly features whose correlation with the 'O' column exceeds 0.5 in
# absolute value.
# NOTE(review): the recorded run shows O, C, E, A, N and Fake_news all
# correlating at 1.000000 — check whether these columns are duplicates.
df_corr = df_weekly.corr()['O']
golden_features_list = (
    df_corr.loc[df_corr.abs() > 0.5]
    .sort_values(ascending=False)
)
print(
    "There are {} strongly correlated values with Openness:\n{}".format(
        len(golden_features_list), golden_features_list
    )
)
There are 22 strongly correlated values with Openness:
Fake_news                    1.000000
A                            1.000000
E                            1.000000
C                            1.000000
O                            1.000000
N                            1.000000
Verified_status_False        0.999993
B5_E_Dm                      0.933486
Downward_momentum_created    0.933486
B5_O_Dm                      0.933486
B5_C_Dm                      0.933486
B5_A_Dm                      0.933486
B5_N_Dm                      0.933486
Volume                       0.926261
Variance                     0.785738
Verified_status_True         0.747742
TRANGE                       0.724065
AvgTrueRange                 0.609856
OBV                          0.583560
TRIX                         0.579919
ROCP                         0.544325
MACD                         0.541994
Name: O, dtype: float64
In [2103]:
# Weekly features whose correlation with the 'C' column exceeds 0.5 in
# absolute value.
df_corr = df_weekly.corr()['C']
golden_features_list = (
    df_corr.loc[df_corr.abs() > 0.5]
    .sort_values(ascending=False)
)
print(
    "There are {} strongly correlated values with conscientiousness:\n{}".format(
        len(golden_features_list), golden_features_list
    )
)
There are 22 strongly correlated values with conscientiousness:
Fake_news                    1.000000
A                            1.000000
E                            1.000000
C                            1.000000
O                            1.000000
N                            1.000000
Verified_status_False        0.999993
B5_E_Dm                      0.933486
Downward_momentum_created    0.933486
B5_O_Dm                      0.933486
B5_C_Dm                      0.933486
B5_A_Dm                      0.933486
B5_N_Dm                      0.933486
Volume                       0.926261
Variance                     0.785738
Verified_status_True         0.747742
TRANGE                       0.724065
AvgTrueRange                 0.609856
OBV                          0.583560
TRIX                         0.579919
ROCP                         0.544325
MACD                         0.541994
Name: C, dtype: float64
In [2104]:
# Weekly features whose correlation with the 'E' column exceeds 0.5 in
# absolute value. The message names the trait this cell analyses; it was
# previously copy-pasted from the 'C' cell and read "conscientiousness".
df_corr = df_weekly.corr()['E']
golden_features_list = df_corr[df_corr.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with extraversion:\n{}".format(len(golden_features_list), golden_features_list))
There are 22 strongly correlated values with conscientiousness:
Fake_news                    1.000000
A                            1.000000
E                            1.000000
C                            1.000000
O                            1.000000
N                            1.000000
Verified_status_False        0.999993
B5_E_Dm                      0.933486
Downward_momentum_created    0.933486
B5_O_Dm                      0.933486
B5_C_Dm                      0.933486
B5_A_Dm                      0.933486
B5_N_Dm                      0.933486
Volume                       0.926261
Variance                     0.785738
Verified_status_True         0.747742
TRANGE                       0.724065
AvgTrueRange                 0.609856
OBV                          0.583560
TRIX                         0.579919
ROCP                         0.544325
MACD                         0.541994
Name: E, dtype: float64
In [2105]:
# Weekly features whose correlation with the 'A' column exceeds 0.5 in
# absolute value. The message names the trait this cell analyses; it was
# previously copy-pasted from the 'C' cell and read "conscientiousness".
df_corr = df_weekly.corr()['A']
golden_features_list = df_corr[df_corr.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with agreeableness:\n{}".format(len(golden_features_list), golden_features_list))
There are 22 strongly correlated values with conscientiousness:
Fake_news                    1.000000
A                            1.000000
E                            1.000000
C                            1.000000
O                            1.000000
N                            1.000000
Verified_status_False        0.999993
B5_E_Dm                      0.933486
Downward_momentum_created    0.933486
B5_O_Dm                      0.933486
B5_C_Dm                      0.933486
B5_A_Dm                      0.933486
B5_N_Dm                      0.933486
Volume                       0.926261
Variance                     0.785738
Verified_status_True         0.747742
TRANGE                       0.724065
AvgTrueRange                 0.609856
OBV                          0.583560
TRIX                         0.579919
ROCP                         0.544325
MACD                         0.541994
Name: A, dtype: float64
In [2106]:
# Weekly features whose correlation with the 'N' column exceeds 0.5 in
# absolute value. The message names the trait this cell analyses; it was
# previously copy-pasted from the 'C' cell and read "conscientiousness".
df_corr = df_weekly.corr()['N']
golden_features_list = df_corr[df_corr.abs() > 0.5].sort_values(ascending=False)
print("There are {} strongly correlated values with neuroticism:\n{}".format(len(golden_features_list), golden_features_list))
There are 22 strongly correlated values with conscientiousness:
Fake_news                    1.000000
A                            1.000000
E                            1.000000
C                            1.000000
O                            1.000000
N                            1.000000
Verified_status_False        0.999993
B5_E_Dm                      0.933486
Downward_momentum_created    0.933486
B5_O_Dm                      0.933486
B5_C_Dm                      0.933486
B5_A_Dm                      0.933486
B5_N_Dm                      0.933486
Volume                       0.926261
Variance                     0.785738
Verified_status_True         0.747742
TRANGE                       0.724065
AvgTrueRange                 0.609856
OBV                          0.583560
TRIX                         0.579919
ROCP                         0.544325
MACD                         0.541994
Name: N, dtype: float64
In [2107]:
# Weekly features whose correlation with B5_O_Um exceeds 0.5 in absolute value.
# NOTE(review): the recorded run printed an empty Series (not even the column
# itself), which suggests its correlations are NaN — confirm the column varies.
df_corr = df_weekly.corr()['B5_O_Um']
golden_features_list = (
    df_corr.loc[df_corr.abs() > 0.5]
    .sort_values(ascending=False)
)
print(
    "There are {} strongly correlated values with B5_O_Um:\n{}".format(
        len(golden_features_list), golden_features_list
    )
)
There are 0 strongly correlated values with B5_O_Um:
Series([], Name: B5_O_Um, dtype: float64)
In [2108]:
# Weekly features whose correlation with B5_C_Um exceeds 0.5 in absolute value.
df_corr = df_weekly.corr()['B5_C_Um']
golden_features_list = (
    df_corr.loc[df_corr.abs() > 0.5]
    .sort_values(ascending=False)
)
print(
    "There are {} strongly correlated values with B5_C_Um:\n{}".format(
        len(golden_features_list), golden_features_list
    )
)
There are 0 strongly correlated values with B5_C_Um:
Series([], Name: B5_C_Um, dtype: float64)
In [2109]:
# Weekly features whose correlation with B5_E_Um exceeds 0.5 in absolute value.
df_corr = df_weekly.corr()['B5_E_Um']
golden_features_list = (
    df_corr.loc[df_corr.abs() > 0.5]
    .sort_values(ascending=False)
)
print(
    "There are {} strongly correlated values with B5_E_Um:\n{}".format(
        len(golden_features_list), golden_features_list
    )
)
There are 0 strongly correlated values with B5_E_Um:
Series([], Name: B5_E_Um, dtype: float64)
In [2110]:
# Weekly features whose correlation with B5_A_Um exceeds 0.5 in absolute value.
df_corr = df_weekly.corr()['B5_A_Um']
golden_features_list = (
    df_corr.loc[df_corr.abs() > 0.5]
    .sort_values(ascending=False)
)
print(
    "There are {} strongly correlated values with B5_A_Um:\n{}".format(
        len(golden_features_list), golden_features_list
    )
)
There are 0 strongly correlated values with B5_A_Um:
Series([], Name: B5_A_Um, dtype: float64)
In [2111]:
# Weekly features whose correlation with B5_N_Um exceeds 0.5 in absolute value.
df_corr = df_weekly.corr()['B5_N_Um']
golden_features_list = (
    df_corr.loc[df_corr.abs() > 0.5]
    .sort_values(ascending=False)
)
print(
    "There are {} strongly correlated values with B5_N_Um:\n{}".format(
        len(golden_features_list), golden_features_list
    )
)
There are 0 strongly correlated values with B5_N_Um:
Series([], Name: B5_N_Um, dtype: float64)

Downward momentum correlation

In [2112]:
# Weekly features whose correlation with B5_O_Dm exceeds 0.5 in absolute value.
# NOTE(review): the recorded run shows all B5_*_Dm columns and
# Downward_momentum_created correlating at 1.000000 — check for duplicates.
df_corr = df_weekly.corr()['B5_O_Dm']
golden_features_list = (
    df_corr.loc[df_corr.abs() > 0.5]
    .sort_values(ascending=False)
)
print(
    "There are {} strongly correlated values with B5_O_Dm:\n{}".format(
        len(golden_features_list), golden_features_list
    )
)
There are 19 strongly correlated values with B5_O_Dm:
B5_A_Dm                      1.000000
B5_N_Dm                      1.000000
Downward_momentum_created    1.000000
B5_O_Dm                      1.000000
B5_C_Dm                      1.000000
B5_E_Dm                      1.000000
N                            0.933486
Fake_news                    0.933486
O                            0.933486
C                            0.933486
E                            0.933486
A                            0.933486
Verified_status_False        0.933024
Volume                       0.904557
Verified_status_True         0.778478
Variance                     0.712032
TRANGE                       0.643234
ROCP                         0.518690
Return                       0.501675
Name: B5_O_Dm, dtype: float64
In [2113]:
# Weekly features whose correlation with B5_C_Dm exceeds 0.5 in absolute value.
df_corr = df_weekly.corr()['B5_C_Dm']
golden_features_list = (
    df_corr.loc[df_corr.abs() > 0.5]
    .sort_values(ascending=False)
)
print(
    "There are {} strongly correlated values with B5_C_Dm:\n{}".format(
        len(golden_features_list), golden_features_list
    )
)
There are 19 strongly correlated values with B5_C_Dm:
B5_A_Dm                      1.000000
B5_N_Dm                      1.000000
Downward_momentum_created    1.000000
B5_O_Dm                      1.000000
B5_C_Dm                      1.000000
B5_E_Dm                      1.000000
N                            0.933486
Fake_news                    0.933486
O                            0.933486
C                            0.933486
E                            0.933486
A                            0.933486
Verified_status_False        0.933024
Volume                       0.904557
Verified_status_True         0.778478
Variance                     0.712032
TRANGE                       0.643234
ROCP                         0.518690
Return                       0.501675
Name: B5_C_Dm, dtype: float64
In [2114]:
# Weekly features whose correlation with B5_E_Dm exceeds 0.5 in absolute value.
df_corr = df_weekly.corr()['B5_E_Dm']
golden_features_list = (
    df_corr.loc[df_corr.abs() > 0.5]
    .sort_values(ascending=False)
)
print(
    "There are {} strongly correlated values with B5_E_Dm:\n{}".format(
        len(golden_features_list), golden_features_list
    )
)
There are 19 strongly correlated values with B5_E_Dm:
B5_A_Dm                      1.000000
B5_N_Dm                      1.000000
Downward_momentum_created    1.000000
B5_O_Dm                      1.000000
B5_C_Dm                      1.000000
B5_E_Dm                      1.000000
N                            0.933486
Fake_news                    0.933486
O                            0.933486
C                            0.933486
E                            0.933486
A                            0.933486
Verified_status_False        0.933024
Volume                       0.904557
Verified_status_True         0.778478
Variance                     0.712032
TRANGE                       0.643234
ROCP                         0.518690
Return                       0.501675
Name: B5_E_Dm, dtype: float64
In [2115]:
# Weekly features whose correlation with B5_A_Dm exceeds 0.5 in absolute value.
df_corr = df_weekly.corr()['B5_A_Dm']
golden_features_list = (
    df_corr.loc[df_corr.abs() > 0.5]
    .sort_values(ascending=False)
)
print(
    "There are {} strongly correlated values with B5_A_Dm:\n{}".format(
        len(golden_features_list), golden_features_list
    )
)
There are 19 strongly correlated values with B5_A_Dm:
B5_A_Dm                      1.000000
B5_N_Dm                      1.000000
Downward_momentum_created    1.000000
B5_O_Dm                      1.000000
B5_C_Dm                      1.000000
B5_E_Dm                      1.000000
N                            0.933486
Fake_news                    0.933486
O                            0.933486
C                            0.933486
E                            0.933486
A                            0.933486
Verified_status_False        0.933024
Volume                       0.904557
Verified_status_True         0.778478
Variance                     0.712032
TRANGE                       0.643234
ROCP                         0.518690
Return                       0.501675
Name: B5_A_Dm, dtype: float64
In [2116]:
# Weekly features whose correlation with B5_N_Dm exceeds 0.5 in absolute value.
df_corr = df_weekly.corr()['B5_N_Dm']
golden_features_list = (
    df_corr.loc[df_corr.abs() > 0.5]
    .sort_values(ascending=False)
)
print(
    "There are {} strongly correlated values with B5_N_Dm:\n{}".format(
        len(golden_features_list), golden_features_list
    )
)
There are 19 strongly correlated values with B5_N_Dm:
B5_A_Dm                      1.000000
B5_N_Dm                      1.000000
Downward_momentum_created    1.000000
B5_O_Dm                      1.000000
B5_C_Dm                      1.000000
B5_E_Dm                      1.000000
N                            0.933486
Fake_news                    0.933486
O                            0.933486
C                            0.933486
E                            0.933486
A                            0.933486
Verified_status_False        0.933024
Volume                       0.904557
Verified_status_True         0.778478
Variance                     0.712032
TRANGE                       0.643234
ROCP                         0.518690
Return                       0.501675
Name: B5_N_Dm, dtype: float64
In [2117]:
# Weekly features whose correlation with Fake_news exceeds 0.5 in absolute
# value. The printed label still uses the column's pre-rename name.
df_corr = df_weekly.corr()['Fake_news']
golden_features_list = (
    df_corr.loc[df_corr.abs() > 0.5]
    .sort_values(ascending=False)
)
print(
    "There are {} strongly correlated values with Real_or_Fake_tweet :\n{}".format(
        len(golden_features_list), golden_features_list
    )
)
There are 22 strongly correlated values with Real_or_Fake_tweet :
Fake_news                    1.000000
A                            1.000000
E                            1.000000
C                            1.000000
O                            1.000000
N                            1.000000
Verified_status_False        0.999993
B5_E_Dm                      0.933486
Downward_momentum_created    0.933486
B5_O_Dm                      0.933486
B5_C_Dm                      0.933486
B5_A_Dm                      0.933486
B5_N_Dm                      0.933486
Volume                       0.926261
Variance                     0.785738
Verified_status_True         0.747742
TRANGE                       0.724065
AvgTrueRange                 0.609856
OBV                          0.583560
TRIX                         0.579919
ROCP                         0.544325
MACD                         0.541994
Name: Fake_news, dtype: float64
In [2118]:
# Weekly features whose correlation with Downward_momentum_created exceeds
# 0.5 in absolute value.
df_corr = df_weekly.corr()['Downward_momentum_created']
golden_features_list = (
    df_corr.loc[df_corr.abs() > 0.5]
    .sort_values(ascending=False)
)
print(
    "There are {} strongly correlated values with Downward_momentum_created :\n{}".format(
        len(golden_features_list), golden_features_list
    )
)
There are 19 strongly correlated values with Downward_momentum_created :
B5_A_Dm                      1.000000
B5_N_Dm                      1.000000
Downward_momentum_created    1.000000
B5_O_Dm                      1.000000
B5_C_Dm                      1.000000
B5_E_Dm                      1.000000
N                            0.933486
Fake_news                    0.933486
O                            0.933486
C                            0.933486
E                            0.933486
A                            0.933486
Verified_status_False        0.933024
Volume                       0.904557
Verified_status_True         0.778478
Variance                     0.712032
TRANGE                       0.643234
ROCP                         0.518690
Return                       0.501675
Name: Downward_momentum_created, dtype: float64
In [2119]:
# Weekly features whose correlation with Upward_momentum_created exceeds
# 0.5 in absolute value.
# NOTE(review): the recorded run printed an empty Series (not even the column
# itself), which suggests its correlations are NaN — confirm the column varies.
df_corr = df_weekly.corr()['Upward_momentum_created']
golden_features_list = (
    df_corr.loc[df_corr.abs() > 0.5]
    .sort_values(ascending=False)
)
print(
    "There are {} strongly correlated values with Upward_momentum_created :\n{}".format(
        len(golden_features_list), golden_features_list
    )
)
There are 0 strongly correlated values with Upward_momentum_created :
Series([], Name: Upward_momentum_created, dtype: float64)
In [2120]:
# Weekly features whose correlation with Verified_status_True exceeds 0.5
# in absolute value.
df_corr = df_weekly.corr()['Verified_status_True']
golden_features_list = (
    df_corr.loc[df_corr.abs() > 0.5]
    .sort_values(ascending=False)
)
print(
    "There are {} strongly correlated values with Verified_status_True :\n{}".format(
        len(golden_features_list), golden_features_list
    )
)
There are 17 strongly correlated values with Verified_status_True :
Verified_status_True         1.000000
B5_N_Dm                      0.778478
Downward_momentum_created    0.778478
B5_O_Dm                      0.778478
B5_C_Dm                      0.778478
B5_E_Dm                      0.778478
B5_A_Dm                      0.778478
Volume                       0.769874
N                            0.747742
O                            0.747742
C                            0.747742
E                            0.747742
A                            0.747742
Fake_news                    0.747742
Verified_status_False        0.745241
Variance                     0.697604
TRANGE                       0.614594
Name: Verified_status_True, dtype: float64
In [2121]:
# Weekly features whose correlation with Verified_status_False exceeds 0.5
# in absolute value.
df_corr = df_weekly.corr()['Verified_status_False']
golden_features_list = (
    df_corr.loc[df_corr.abs() > 0.5]
    .sort_values(ascending=False)
)
print(
    "There are {} strongly correlated values with Verified_status_False :\n{}".format(
        len(golden_features_list), golden_features_list
    )
)
There are 22 strongly correlated values with Verified_status_False :
Verified_status_False        1.000000
Fake_news                    0.999993
A                            0.999993
E                            0.999993
C                            0.999993
O                            0.999993
N                            0.999993
B5_E_Dm                      0.933024
Downward_momentum_created    0.933024
B5_O_Dm                      0.933024
B5_C_Dm                      0.933024
B5_A_Dm                      0.933024
B5_N_Dm                      0.933024
Volume                       0.925817
Variance                     0.785109
Verified_status_True         0.745241
TRANGE                       0.723646
AvgTrueRange                 0.609810
OBV                          0.584174
TRIX                         0.580047
ROCP                         0.544017
MACD                         0.541844
Name: Verified_status_False, dtype: float64
In [2122]:
# Apply seaborn's 0.8 font scale before drawing the pair plots below.
sns.set(font_scale=0.8)
In [2123]:
# Scatter every weekly feature against NATR, five features per figure.
weekly_columns = df_weekly.columns
for i in range(0, len(weekly_columns), 5):
    sns.pairplot(
        data=df_weekly,
        x_vars=weekly_columns[i:i + 5],
        y_vars=['NATR'],
    )
In [2124]:
# Replace every remaining NaN in the weekly frame with 0, in place.
df_weekly.fillna(value=0, inplace=True)
In [2125]:
# Drop, in place, any weekly row that still contains a NaN.
# NOTE(review): the previous cell already fills every NaN with 0, so this
# looks like a no-op when the cells run in order — confirm it can be removed.
df_weekly.dropna(inplace=True)
In [2126]:
# Correlations between all weekly features except Close. Entries that are
# neither >= 0.5 nor <= -0.4 are masked so only stronger pairs are annotated.
corr = df_weekly.drop(columns='Close').corr()
plt.figure(figsize=(12, 10))

strong_corr = corr.where((corr >= 0.5) | (corr <= -0.4))
sns.heatmap(
    strong_corr,
    cmap='YlGnBu',
    vmin=-1.0,
    vmax=1.0,
    linewidths=0.1,
    annot=True,
    annot_kws={"size": 8},
    square=True,
);

Weekly volatility distribution

In [2127]:
# Distribution of the weekly NATR values with a fitted normal curve.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases —
# confirm the installed version before upgrading the environment.
with sns.axes_style("darkgrid"):
    fig, ax = plt.subplots(figsize=(10,6))
    sns.distplot(
        df_weekly['NATR'],
        bins=50,
        norm_hist=True,
        fit=stats.norm,
        ax=ax,
    )
    plt.title('Weekly Volatility Distribution')

    plt.show();